pegRNA Library Comparison

Author

Lance Parsons

Read data

Get list of samples

Code
samples <- read_tsv("config/samples.tsv",
  col_types = list(
    sample_name = col_character(),
    cell_line = col_factor(),
    exogenous_rna = col_factor(),
    day = col_factor()
  )
)
units <- read_tsv("config/units.tsv",
  col_types = list(
    sample_name = col_character(),
    unit_name = col_character(),
    fq1 = col_character(),
    fq2 = col_character()
  )
)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE) %>%
  mutate(
    day = as.factor(paste0("day", day)),
    batch = as.factor(case_when(
      exogenous_rna == "mastermix1" ~ "batch3",
      exogenous_rna == "mastermix2" ~ "batch3",
      TRUE ~ "batch2"
    )),
    .before = cell_line,
  )

sample_units

Table of exogenous RNA mixtures

Code
# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2", "PJY103_mDNMT1", "PJY300_mDNMT1")) {
  t <- Biostrings::readDNAStringSet(sprintf("data/references/%s.fa", mix))
  rna_mixes <- rbind(rna_mixes, tibble(
    exogenous_rna = mix,
    rna_species = word(t@ranges@NAMES, 1),
    start = 1,
    end = t@ranges@width
  ))
}
rna_mixes

Biotype comparison

  • count only fragments that were properly aligned
  • annotate with GENCODE gene model
  • primary alignments were counted, even if the fragments aligned multiple times
  • fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
Code
human_counts_dir <- "results/smrna_count/"
biotype_counts_files <- paste0(
  human_counts_dir,
  sample_units$sample_unit,
  "_first_proper_pair_biotype_count.txt"
)
exogenous_counts_dir <- "results/exogenous_rna_count/"
exogenous_counts_files <- paste0(
  exogenous_counts_dir,
  sample_units$sample_unit,
  "_idxstats.txt"
)

biotype_counts <- readr::read_tsv(
  biotype_counts_files[1],
  comment = "#",
  col_names = c("biotype", biotype_counts_files[1]),
  col_types = "ci"
)
exogenous_counts <- read_tsv(
  exogenous_counts_files[1],
  col_names = c("exogenous_rna", exogenous_counts_files[1]),
  col_types = "c-i-"
)
biotype_counts <- biotype_counts %>%
  dplyr::add_row(
    biotype = "exogenous_rna",
    "{biotype_counts_files[1]}" := sum(exogenous_counts[[exogenous_counts_files[1]]])
  )

for (i in 2:length(biotype_counts_files)) {
  biotype_sample <-
    readr::read_tsv(
      biotype_counts_files[i],
      comment = "#",
      col_names = c("biotype", biotype_counts_files[i]),
      col_types = "ci"
    )

  exogenous_counts_sample <-
    readr::read_tsv(
      exogenous_counts_files[i],
      col_names = c("exogenous_rna", exogenous_counts_files[i]),
      col_types = "c-i-"
    )

  biotype_sample <- biotype_sample %>%
    dplyr::add_row(
      biotype = "exogenous_rna",
      "{biotype_counts_files[i]}" :=
        sum(exogenous_counts_sample[[exogenous_counts_files[i]]])
    )

  biotype_counts <- biotype_counts %>%
    dplyr::full_join(biotype_sample, by = "biotype")
}

biotype_counts <- biotype_counts %>%
  rename_all(~ stringr::str_replace_all(
    ., human_counts_dir, ""
  )) %>%
  rename_all(~ stringr::str_replace_all(., "_first_proper_pair_biotype_count.txt", "")) %>%
  tidyr::pivot_longer(!biotype, names_to = "sample", values_to = "count")

biotype_counts
Code
p <- ggplot(
  data = subset(biotype_counts, !is.na(count)),
  aes(x = sample, y = count, fill = biotype)
) +
  geom_bar(stat = "identity", position = "fill") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
p

small RNA gene counts

  • count only fragments that were properly aligned
  • annotate with GENCODE gene model
  • primary alignments were counted, even if the fragments aligned multiple times
  • fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
  • exogenous RNA counts are total fragments that aligned
Code
human_counts_dir <- "results/smrna_count/"
gene_counts_files <- paste0(
  human_counts_dir,
  sample_units$sample_unit,
  "_first_proper_pair_gene_count.txt"
)

exogenous_counts_dir <- "results/exogenous_rna_count/"
exogenous_counts_files <- paste0(
  exogenous_counts_dir,
  sample_units$sample_unit,
  "_idxstats.txt"
)

gene_counts <- readr::read_tsv(
  gene_counts_files[1],
  comment = "#",
  col_names = c("gene", gene_counts_files[1]),
  col_types = "ci"
)
exogenous_counts <- read_tsv(
  exogenous_counts_files[1],
  col_names = c("gene", gene_counts_files[1]),
  col_types = "c-i-"
)
gene_counts <- rbind(gene_counts, exogenous_counts)

for (i in 2:length(gene_counts_files)) {
  gene_sample <-
    readr::read_tsv(
      gene_counts_files[i],
      comment = "#",
      col_names = c("gene", gene_counts_files[i]),
      col_types = "ci"
    )
  exogenous_counts_sample <- read_tsv(
    exogenous_counts_files[i],
    col_names = c("gene", gene_counts_files[i]),
    col_types = "c-i-"
  )
  gene_sample <- rbind(gene_sample, exogenous_counts_sample)
  gene_counts <- gene_counts %>%
    dplyr::full_join(gene_sample, by = "gene")
}

gene_counts <- gene_counts %>%
  rename_all(~ stringr::str_replace_all(
    ., human_counts_dir, ""
  )) %>%
  rename_all(~ stringr::str_replace_all(., "_first_proper_pair_gene_count.txt", ""))

gene_counts
Code
# List of exogenous genes to highlight
exogenous_rna_names <- gene_counts %>%
  dplyr::filter(str_detect(gene, "^PJY")) %>%
  pull(gene)

Import sample metadata and counts into DESeq2

Read these counts into DESeq2 along with the sample metadata.

Set the design to ~ exogenous_rna + day + cell_line.

Code
dds <- DESeqDataSetFromMatrix(
  countData = gene_counts %>%
    tibble::column_to_rownames("gene") %>%
    replace(is.na(.), 0),
  colData = sample_units,
  design = ~ exogenous_rna + day + cell_line
)
converting counts to integer mode
Code
dds
class: DESeqDataSet 
dim: 56701 56 
metadata(1): version
assays(1): counts
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(0):
colnames(56): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... Parental_PJY300_epegRNA_day2_rep3
  Parental_PJY300_epegRNA_day2_rep4
colData names(9): sample_unit sample_name ... fq1 fq2

Sample comparisons

Variance Stabalized Transformation

Code
vsd <- vst(dds, blind = FALSE)

Heatmap of sample-to-sample distances

Code
sample_dists <- dist(t(assay(vsd)))

sample_dist_matrix <- as.matrix(sample_dists)
rownames(sample_dist_matrix) <- paste(vsd$cell_line,
  vsd$exogenous_rna,
  vsd$day,
  sep = "-"
)
colnames(sample_dist_matrix) <- NULL
Code
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sample_dist_matrix,
  clustering_distance_rows = sample_dists,
  clustering_distance_cols = sample_dists,
  col = colors
)

PCA plot of samples

Code
plotPCA(vsd, intgroup = c("cell_line", "exogenous_rna", "day"))

Differetial Expression

Batch 2 - Day 1

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch2_day1 <- subset(dds, select = colData(dds)$batch == "batch2" &
  colData(dds)$day == "day1")
dds_batch2_day1$exogenous_rna <- droplevels(dds_batch2_day1$exogenous_rna)
dds_batch2_day1$day <- droplevels(dds_batch2_day1$day)
design(dds_batch2_day1) <- ~ exogenous_rna + cell_line
dds_batch2_day1 <- DESeq(dds_batch2_day1, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch2_day1
class: DESeqDataSet 
dim: 56701 16 
metadata(1): version
assays(4): counts mu H cooks
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day1_rep1
  P1E10_sorted_PJY103_pegRNA_day1_rep2 ...
  Parental_PJY300_epegRNA_day1_rep3 Parental_PJY300_epegRNA_day1_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day1 <- results(dds_batch2_day1, alpha = 0.05)
res_batch2_day1
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56701 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat      pvalue
                <numeric>      <numeric> <numeric> <numeric>   <numeric>
GAS5              3358381      0.1558718 0.0335180  4.650395 3.31299e-06
SNORD30           1605167      0.0910242 0.0576353  1.579315 1.14264e-01
SNORD49A          1156164      0.1842127 0.0436209  4.223034 2.41035e-05
SNORD26            857547      0.0954226 0.0539739  1.767939 7.70710e-02
SNHG1              780897     -0.0217632 0.0400106 -0.543936 5.86486e-01
...                   ...            ...       ...       ...         ...
ENSG00000275017         0             NA        NA        NA          NA
ENSG00000214669         0             NA        NA        NA          NA
ENSG00000276467         0             NA        NA        NA          NA
MIR6134                 0             NA        NA        NA          NA
ENSG00000270413         0             NA        NA        NA          NA
                       padj
                  <numeric>
GAS5            7.59058e-05
SNORD30         4.51373e-01
SNORD49A        4.75941e-04
SNORD26         3.61350e-01
SNHG1           8.79543e-01
...                     ...
ENSG00000275017          NA
ENSG00000214669          NA
ENSG00000276467          NA
MIR6134                  NA
ENSG00000270413          NA
Code
summary(res_batch2_day1)

out of 51617 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 936, 1.8%
LFC < 0 (down)     : 1747, 3.4%
outliers [1]       : 1, 0.0019%
low counts [2]     : 23893, 46%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch2_day1_lfc <- lfcShrink(dds_batch2_day1,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch2_day1_lfc_df <- res_batch2_day1_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch2_day1_lfc_df, file = "diffexp_results_batch2_day1.tsv")

res_batch2_day1_lfc_labelled <- res_batch2_day1_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch2_day1_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, colour = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch2_day1_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0
  ) +
  theme_bw()
Warning: Removed 5084 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch2_day1_lfc_labelled$gene) {
  gene_prefix <- str_split(gene, "_")[[1]][1]
  d <- plotCounts(dds_batch2_day1,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(gene_prefix, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list)

Batch 2 - Day 2

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch2_day2 <- subset(dds, select = colData(dds)$batch == "batch2" &
  colData(dds)$day == "day2")
dds_batch2_day2$exogenous_rna <- droplevels(dds_batch2_day2$exogenous_rna)
dds_batch2_day2$day <- droplevels(dds_batch2_day2$day)
design(dds_batch2_day2) <- ~ exogenous_rna + cell_line
dds_batch2_day2 <- DESeq(dds_batch2_day2, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch2_day2
class: DESeqDataSet 
dim: 56701 16 
metadata(1): version
assays(4): counts mu H cooks
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day2_rep1
  P1E10_sorted_PJY103_pegRNA_day2_rep2 ...
  Parental_PJY300_epegRNA_day2_rep3 Parental_PJY300_epegRNA_day2_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day2 <- results(dds_batch2_day2, alpha = 0.05)
res_batch2_day2
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56701 rows and 6 columns
                 baseMean log2FoldChange     lfcSE       stat      pvalue
                <numeric>      <numeric> <numeric>  <numeric>   <numeric>
GAS5              3192415     0.18567762 0.0302504    6.13802 8.35556e-10
SNORD30           1514330     0.15582664 0.0510950    3.04974 2.29038e-03
SNORD49A          1170290     0.19302857 0.0427120    4.51931 6.20425e-06
SNORD26            825436     0.09512923 0.0397356    2.39406 1.66632e-02
SNHG1              678111     0.00437699 0.0310755    0.14085 8.87988e-01
...                   ...            ...       ...        ...         ...
ENSG00000275017 0.0500828      -0.164237   3.06038 -0.0536654    0.957202
ENSG00000214669 0.0500828      -0.164237   3.06038 -0.0536654    0.957202
ENSG00000276467 0.0500828      -0.164237   3.06038 -0.0536654    0.957202
MIR6134         0.0500828      -0.164237   3.06038 -0.0536654    0.957202
ENSG00000270413 0.0500828      -0.164237   3.06038 -0.0536654    0.957202
                       padj
                  <numeric>
GAS5            3.08070e-08
SNORD30         2.74383e-02
SNORD49A        1.36212e-04
SNORD26         1.33265e-01
SNHG1           9.77319e-01
...                     ...
ENSG00000275017          NA
ENSG00000214669          NA
ENSG00000276467          NA
MIR6134                  NA
ENSG00000270413          NA
Code
summary(res_batch2_day2)

out of 50450 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 876, 1.7%
LFC < 0 (down)     : 1590, 3.2%
outliers [1]       : 53, 0.11%
low counts [2]     : 24293, 48%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch2_day2_lfc <- lfcShrink(dds_batch2_day2,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch2_day2_lfc_df <- res_batch2_day2_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch2_day2_lfc_df, file = "diffexp_results_batch2_day2.tsv")

res_batch2_day2_lfc_labelled <- res_batch2_day2_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch2_day2_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, color = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch2_day2_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0
  ) +
  theme_bw()
Warning: Removed 6251 rows containing missing values (geom_point).

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch2_day2_lfc_labelled$gene) {
  gene_prefix <- str_split(gene, "_")[[1]][1]
  d <- plotCounts(dds_batch2_day2,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(gene_prefix, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list)

Batch 3 - Day 1

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch3_day1 <- subset(dds, select = colData(dds)$batch == "batch3" &
  colData(dds)$day == "day1")
dds_batch3_day1$exogenous_rna <- droplevels(dds_batch3_day1$exogenous_rna)
dds_batch3_day1$day <- droplevels(dds_batch3_day1$day)
design(dds_batch3_day1) <- ~ exogenous_rna + cell_line
dds_batch3_day1 <- DESeq(dds_batch3_day1, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch3_day1
class: DESeqDataSet 
dim: 56701 12 
metadata(1): version
assays(4): counts mu H cooks
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... P1E10_sorted_mastermix2_day1_rep2
  P1E10_sorted_mastermix2_day1_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day1 <- results(dds_batch3_day1, alpha = 0.05)
res_batch3_day1
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56701 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat      pvalue
                <numeric>      <numeric> <numeric> <numeric>   <numeric>
GAS5              3961014       0.244497 0.0338285  7.227526 4.91872e-13
SNORD30           1821914       0.258996 0.0383083  6.760842 1.37192e-11
SNORD49A          1511998       0.249781 0.0372246  6.710113 1.94473e-11
SNORD26            978985       0.216188 0.0333114  6.489916 8.58842e-11
SNHG1              822961       0.013098 0.0370822  0.353214 7.23928e-01
...                   ...            ...       ...       ...         ...
ENSG00000275017         0             NA        NA        NA          NA
ENSG00000214669         0             NA        NA        NA          NA
ENSG00000276467         0             NA        NA        NA          NA
MIR6134                 0             NA        NA        NA          NA
ENSG00000270413         0             NA        NA        NA          NA
                       padj
                  <numeric>
GAS5            1.89349e-11
SNORD30         4.63230e-10
SNORD49A        6.49654e-10
SNORD26         2.67018e-09
SNHG1           9.29315e-01
...                     ...
ENSG00000275017          NA
ENSG00000214669          NA
ENSG00000276467          NA
MIR6134                  NA
ENSG00000270413          NA
Code
summary(res_batch3_day1)

out of 49746 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 859, 1.7%
LFC < 0 (down)     : 1713, 3.4%
outliers [1]       : 1, 0.002%
low counts [2]     : 27764, 56%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch3_day1_lfc <- lfcShrink(dds_batch3_day1,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch3_day1_lfc_df <- res_batch3_day1_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch3_day1_lfc_df, file = "diffexp_results_batch3_day1.tsv")

res_batch3_day1_lfc_labelled <- res_batch3_day1_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch3_day1_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, colour = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch3_day1_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0
  ) +
  theme_bw()
Warning: Removed 6955 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch3_day1_lfc_labelled$gene) {
  
  exogenous_rna <- rna_mixes %>%
    dplyr::filter(rna_species == gene) %>%
    pull(exogenous_rna)
  exogenous_rna_regex <- paste(exogenous_rna, collapse = "|")
  
  d <- plotCounts(dds_batch3_day1,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(exogenous_rna_regex, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 2)

Batch 3 - Day 2

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch3_day2 <- subset(dds, select = colData(dds)$batch == "batch3" &
  colData(dds)$day == "day2")
dds_batch3_day2$exogenous_rna <- droplevels(dds_batch3_day2$exogenous_rna)
dds_batch3_day2$day <- droplevels(dds_batch3_day2$day)
design(dds_batch3_day2) <- ~ exogenous_rna + cell_line
dds_batch3_day2 <- DESeq(dds_batch3_day2, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch3_day2
class: DESeqDataSet 
dim: 56701 12 
metadata(1): version
assays(4): counts mu H cooks
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day2_rep1
  Parental_mastermix1_day2_rep2 ... P1E10_sorted_mastermix2_day2_rep2
  P1E10_sorted_mastermix2_day2_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day2 <- results(dds_batch3_day2, alpha = 0.05)
res_batch3_day2
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56701 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat      pvalue
                <numeric>      <numeric> <numeric> <numeric>   <numeric>
GAS5              4358469      0.3663917 0.0342507 10.697334 1.04748e-26
SNORD30           2397520      0.3288890 0.0409048  8.040348 8.95837e-16
SNORD49A          1753234      0.4149990 0.0478249  8.677475 4.04651e-18
SNORD26           1116912      0.2291981 0.0406001  5.645253 1.64939e-08
SNHG1              818057      0.0377487 0.0417886  0.903324 3.66354e-01
...                   ...            ...       ...       ...         ...
ENSG00000275017         0             NA        NA        NA          NA
ENSG00000214669         0             NA        NA        NA          NA
ENSG00000276467         0             NA        NA        NA          NA
MIR6134                 0             NA        NA        NA          NA
ENSG00000270413         0             NA        NA        NA          NA
                       padj
                  <numeric>
GAS5            9.49256e-25
SNORD30         4.16929e-14
SNORD49A        2.25468e-16
SNORD26         4.16947e-07
SNHG1           7.16437e-01
...                     ...
ENSG00000275017          NA
ENSG00000214669          NA
ENSG00000276467          NA
MIR6134                  NA
ENSG00000270413          NA
Code
summary(res_batch3_day2)

out of 50021 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 1163, 2.3%
LFC < 0 (down)     : 2110, 4.2%
outliers [1]       : 6, 0.012%
low counts [2]     : 26000, 52%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch3_day2_lfc <- lfcShrink(dds_batch3_day2,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch3_day2_lfc_df <- res_batch3_day2_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch3_day2_lfc_df, file = "diffexp_results_batch3_day2.tsv")

res_batch3_day2_lfc_labelled <- res_batch3_day2_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch3_day2_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, colour = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch3_day2_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0
  ) +
  theme_bw()
Warning: Removed 6680 rows containing missing values (geom_point).
Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch3_day2_lfc_labelled$gene) {
  
  exogenous_rna <- rna_mixes %>%
    dplyr::filter(rna_species == gene) %>%
    pull(exogenous_rna)
  exogenous_rna_regex <- paste(exogenous_rna, collapse = "|")
  
  d <- plotCounts(dds_batch3_day2,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(exogenous_rna_regex, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 2)